/*
 * transition_matte.c -- replace alpha channel of track
 *
 * Copyright (C) 2003-2014 Meltytech, LLC
 * Author: Maksym Veremeyenko <verem@m1stereo.tv>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include <framework/mlt.h>

#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(USE_SSE) && defined(ARCH_X86_64)
/*
 * SSE2 fast path: convert cnt blocks of 16 pixels from studio-range luma to
 * full-range alpha.
 *
 * For every block the loop reads 32 bytes from image_b, keeps the low byte
 * of each 16-bit word (the byte at every even offset, i.e. Y in a Y/chroma
 * interleaved row), clamps it to 16..235, subtracts 16 and scales it by
 * (256 + 43) / 256, then stores the resulting 16 bytes to alpha_a.
 *
 * alpha_a  destination, must have room for cnt * 16 bytes
 * image_b  source, must provide cnt * 32 readable bytes
 * cnt      number of 16-pixel blocks; values <= 0 are a no-op
 *
 * Unaligned loads/stores (movdqu) are used throughout, so neither pointer
 * needs any particular alignment.
 */
static void __attribute__((noinline))
copy_Y_to_A_scaled_luma_sse(uint8_t *alpha_a, uint8_t *image_b, int cnt)
{
    /* Each table is eight little-endian 16-bit words holding the same value. */
    static const unsigned char const1[] = {43, 0, 43, 0, 43, 0, 43, 0, 43, 0, 43, 0, 43, 0, 43, 0};
    static const unsigned char const2[] = {16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0};
    static const unsigned char const3[]
        = {235, 0, 235, 0, 235, 0, 235, 0, 235, 0, 235, 0, 235, 0, 235, 0};
    static const unsigned char const4[]
        = {255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0};

    /* The loop below is a do/while on "dec; jnz": a zero count would wrap
       around and run ~2^32 iterations, so bail out before entering it. */
    if (cnt <= 0) {
        return;
    }

    __asm__ volatile(
        "movdqu         (%[equ43]), %%xmm7      \n\t" /* load multiplier 43 */
        "movdqu         (%[equ16]), %%xmm6      \n\t" /* load lower bound 16 */
        "movdqu         (%[equ235]), %%xmm5     \n\t" /* load upper bound 235 */
        "movdqu         (%[equ255]), %%xmm4     \n\t" /* load word mask 0x00ff */

        /* numeric local label: stays unique even if this body is duplicated */
        "1:                                     \n\t"

        /* load pixels block 1 */
        "movdqu         0(%[image_b]), %%xmm0   \n\t"
        "add            $0x10, %[image_b]       \n\t"

        /* load pixels block 2 */
        "movdqu         0(%[image_b]), %%xmm1   \n\t"
        "add            $0x10, %[image_b]       \n\t"

        /* leave only Y (low byte of every word) */
        "pand           %%xmm4, %%xmm0          \n\t"
        "pand           %%xmm4, %%xmm1          \n\t"

        /* upper range clip: min(Y, 235) */
        "pminsw         %%xmm5, %%xmm0          \n\t"
        "pminsw         %%xmm5, %%xmm1          \n\t"

        /* lower range clip: max(Y, 16) */
        "pmaxsw         %%xmm6, %%xmm0          \n\t"
        "pmaxsw         %%xmm6, %%xmm1          \n\t"

        /* remove the offset: Y - 16, now in 0..219 */
        "psubw          %%xmm6, %%xmm0          \n\t"
        "psubw          %%xmm6, %%xmm1          \n\t"

        /* duplicate values */
        "movdqa         %%xmm0,%%xmm2           \n\t"
        "movdqa         %%xmm1,%%xmm3           \n\t"

        /* regA = regA << 8 */
        "psllw          $8, %%xmm0              \n\t"
        "psllw          $8, %%xmm1              \n\t"

        /* regB = regB * 43 */
        "pmullw         %%xmm7, %%xmm2          \n\t"
        "pmullw         %%xmm7, %%xmm3          \n\t"

        /* regA = regA + regB (at most 219 * 299 = 65481, fits in 16 bits) */
        "paddw          %%xmm2, %%xmm0          \n\t"
        "paddw          %%xmm3, %%xmm1          \n\t"

        /* regA = regA >> 8 */
        "psrlw          $8, %%xmm0              \n\t"
        "psrlw          $8, %%xmm1              \n\t"

        /* pack to 8 bit value */
        "packuswb       %%xmm1, %%xmm0          \n\t"

        /* store */
        "movdqu         %%xmm0, (%[alpha_a])    \n\t"
        "add            $0x10, %[alpha_a]       \n\t"

        /* loop until every block has been processed */
        "dec            %[cnt]                  \n\t"
        "jnz            1b                      \n\t"
        : [cnt] "+r"(cnt), [alpha_a] "+r"(alpha_a), [image_b] "+r"(image_b)
        : [equ43] "r"(const1), [equ16] "r"(const2), [equ235] "r"(const3), [equ255] "r"(const4)
        /* "memory": the asm reads image_b and writes alpha_a through bare
           pointer registers; "cc": add/dec modify the flags. */
        : "cc", "memory", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7");
}
#endif

/*
 * Fill an 8-bit alpha plane from the luma samples of a packed image.
 *
 * For each of the height rows, width samples are taken from every second
 * byte of the source row (offsets 0, 2, 4, ...), clamped to 16..235, shifted
 * down by 16 and scaled by 299/256 (approximating 255/219), then written to
 * consecutive bytes of the destination row.
 *
 * alpha_a   first destination row
 * stride_a  distance in bytes between destination rows
 * image_b   first source row
 * stride_b  distance in bytes between source rows
 * width     samples to convert per row
 * height    number of rows
 *
 * When built with USE_SSE on ARCH_X86_64, whole groups of 16 samples per row
 * are handed to copy_Y_to_A_scaled_luma_sse() and only the remainder is
 * processed by the scalar loop.
 */
static void copy_Y_to_A_scaled_luma(
    uint8_t *alpha_a, int stride_a, uint8_t *image_b, int stride_b, int width, int height)
{
    uint8_t *dst_row = alpha_a;
    uint8_t *src_row = image_b;
    int row;

    for (row = 0; row < height; row++, dst_row += stride_a, src_row += stride_b) {
        int col = 0;

#if defined(USE_SSE) && defined(ARCH_X86_64)
        if (width >= 16) {
            int blocks = width >> 4;

            copy_Y_to_A_scaled_luma_sse(dst_row, src_row, blocks);
            col = blocks << 4;
        }
#endif
        while (col < width) {
            unsigned int luma = src_row[2 * col];

            /* clamp to the 16..235 range */
            luma = luma < 16 ? 16 : (luma > 235 ? 235 : luma);

            /* (luma - 16) * 255 / 219, approximated as * (256 + 43) / 256 */
            luma -= 16;
            dst_row[col] = (luma * (256 + 43)) >> 8;

            col++;
        }
    }
}

/** Get the image.
 *
 * Renders the a frame, renders the b frame as yuv422, and installs a new
 * alpha channel on the a frame whose values are the b frame's luma rescaled
 * from 16..235 to 0..255. Areas of the a frame not covered by the b frame
 * are left fully opaque (255).
 *
 * \param a_frame the frame whose alpha channel is replaced; the b frame is
 *        popped from its stack
 * \param[out] image the a frame's image
 * \param[in,out] format requested format on entry; the a frame's image
 *        format on return
 * \param[in,out] width requested width on entry; the a frame's width on return
 * \param[in,out] height requested height on entry; the a frame's height on return
 * \param writable unused; the a frame is always requested writable
 * \return 0 on success, otherwise the error reported while rendering the
 *         a frame
 */

static int transition_get_image(mlt_frame a_frame,
                                uint8_t **image,
                                mlt_image_format *format,
                                int *width,
                                int *height,
                                int writable)
{
    // Get the b frame from the stack
    mlt_frame b_frame = mlt_frame_pop_frame(a_frame);

    // Render the a frame in the format the caller asked for
    int error = mlt_frame_get_image(a_frame, image, format, width, height, 1);
    if (error) {
        return error;
    }

    // Get the properties of the a frame
    mlt_properties a_props = MLT_FRAME_PROPERTIES(a_frame);

    int width_a = mlt_properties_get_int(a_props, "width");
    int height_a = mlt_properties_get_int(a_props, "height");
    int width_b = width_a;
    int height_b = height_a;
    uint8_t *image_b = NULL;

    // The matte is read as yuv422. A separate variable is used so that
    // *format keeps describing the a frame's image, which is what is returned.
    mlt_image_format format_b = mlt_image_yuv422;

    // Get the image from the b frame; without a usable yuv422 matte the
    // a frame's existing alpha is left alone.
    if (b_frame != NULL && width_a > 0 && height_a > 0
        && mlt_frame_get_image(b_frame, &image_b, &format_b, &width_b, &height_b, 1) == 0
        && image_b != NULL && format_b == mlt_image_yuv422) {
        // Only the overlapping region of both frames can be copied
        int alpha_width = MIN(width_a, width_b);
        int alpha_height = MIN(height_a, height_b);

        // The alpha plane is written with a row stride of width_a, so it must
        // cover the whole a frame; sizing it from the overlap would overflow
        // the buffer whenever the b frame is narrower than the a frame.
        int size = width_a * height_a;
        uint8_t *alpha_a = mlt_pool_alloc(size);

        if (alpha_a != NULL) {
            memset(alpha_a, 255, size);
            mlt_frame_set_alpha(a_frame, alpha_a, size, mlt_pool_release);

            // Copy luma from image_b to a_frame's new alpha channel
            copy_Y_to_A_scaled_luma(alpha_a,
                                    width_a,
                                    image_b,
                                    width_b * 2,
                                    alpha_width,
                                    alpha_height);
        }
    }

    // Extract the a_frame image info
    *width = mlt_properties_get_int(a_props, "width");
    *height = mlt_properties_get_int(a_props, "height");
    *image = mlt_properties_get_data(a_props, "image", NULL);

    return 0;
}

/** Matte transition processing.
 *
 * Defers the actual work to transition_get_image(): the b frame and the
 * get_image callback are stacked on the a frame, which is then returned.
 *
 * \param transition the transition (not used here)
 * \param a_frame the frame that receives the stacked b frame and callback
 * \param b_frame the frame stacked on a_frame for transition_get_image()
 * \return a_frame
 */

static mlt_frame transition_process(mlt_transition transition, mlt_frame a_frame, mlt_frame b_frame)
{
    (void) transition;

    // The b frame goes on the stack first: transition_get_image() pops it
    // as its first step.
    mlt_frame_push_frame(a_frame, b_frame);

    // Then register the callback that builds the alpha channel
    mlt_frame_push_get_image(a_frame, transition_get_image);

    return a_frame;
}

/** Constructor for the transition.
 *
 * \param profile not used
 * \param type not used
 * \param id not used
 * \param lumafile not used
 * \return a new transition wired to transition_process(), or NULL if
 *         mlt_transition_new() fails
 */

mlt_transition transition_matte_init(mlt_profile profile,
                                     mlt_service_type type,
                                     const char *id,
                                     char *lumafile)
{
    mlt_transition self = mlt_transition_new();

    if (self == NULL)
        return NULL;

    // Hook up the processing method
    self->process = transition_process;

    // Tell apps and the framework that this transition is video only
    mlt_properties_set_int(MLT_TRANSITION_PROPERTIES(self), "_transition_type", 1);

    return self;
}
